In [1]:
library(readr)
count_matrix <- read_csv("/home/jovyan/GSE164073_Eye_count_matrix.csv")
dim(count_matrix)
head(count_matrix, 3)
Rows: 27946 Columns: 19 ── Column specification ──────────────────────────────────────────────────────── Delimiter: "," chr (1): Gene dbl (18): MW1_cornea_mock_1, MW2_cornea_mock_2, MW3_cornea_mock_3, MW4_corne... ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
- 27946
- 19
| Gene | MW1_cornea_mock_1 | MW2_cornea_mock_2 | MW3_cornea_mock_3 | MW4_cornea_CoV2_1 | MW5_cornea_CoV2_2 | MW6_cornea_CoV2_3 | MW7_limbus_mock_1 | MW8_limbus_mock_2 | MW9_limbus_mock_3 | MW10_limbus_CoV2_1 | MW11_limbus_CoV2_2 | MW12_limbus_CoV2_3 | MW13_sclera_mock_1 | MW14_sclera_mock_2 | MW15_sclera_mock_3 | MW16_sclera_CoV2_1 | MW17_sclera_CoV2_2 | MW18_sclera_CoV2_3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
| A1BG | 91 | 131 | 86 | 77 | 69 | 72 | 128 | 97 | 112 | 96 | 95 | 82 | 82 | 70 | 101 | 91 | 78 | 84 |
| A1BG-AS1 | 292 | 284 | 271 | 232 | 250 | 241 | 444 | 311 | 355 | 274 | 298 | 322 | 233 | 241 | 243 | 279 | 203 | 278 |
| A1CF | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Preprocessing¶
In [2]:
count_matrix <- as.data.frame(count_matrix) # convert tibble to data.frame
# Set gene symbols as row names
rownames(count_matrix) <- count_matrix$Gene
# Remove the Gene column so only numeric counts remain
count_matrix <- count_matrix[, -which(names(count_matrix) == "Gene")]
# Ensure counts are integers
count_matrix[] <- lapply(count_matrix, function(x) as.integer(round(x)))
# check data
head(count_matrix, 3)
| MW1_cornea_mock_1 | MW2_cornea_mock_2 | MW3_cornea_mock_3 | MW4_cornea_CoV2_1 | MW5_cornea_CoV2_2 | MW6_cornea_CoV2_3 | MW7_limbus_mock_1 | MW8_limbus_mock_2 | MW9_limbus_mock_3 | MW10_limbus_CoV2_1 | MW11_limbus_CoV2_2 | MW12_limbus_CoV2_3 | MW13_sclera_mock_1 | MW14_sclera_mock_2 | MW15_sclera_mock_3 | MW16_sclera_CoV2_1 | MW17_sclera_CoV2_2 | MW18_sclera_CoV2_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | |
| A1BG | 91 | 131 | 86 | 77 | 69 | 72 | 128 | 97 | 112 | 96 | 95 | 82 | 82 | 70 | 101 | 91 | 78 | 84 |
| A1BG-AS1 | 292 | 284 | 271 | 232 | 250 | 241 | 444 | 311 | 355 | 274 | 298 | 322 | 233 | 241 | 243 | 279 | 203 | 278 |
| A1CF | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Create metadata¶
In [3]:
sample_names <- colnames(count_matrix)
# Suppose the 19 samples are in order:
# 6 cornea (3 mock + 3 CoV2), 6 limbus (3 mock + 3 CoV2), 6 sclera (3 mock + 3 CoV2), + 1 extra?
# Adjust as needed to match your dataset
tissue <- c(rep("cornea",6), rep("limbus",6), rep("sclera",6))
condition <- c(rep(c("mock","CoV2"), each=3), rep(c("mock","CoV2"), each=3), rep(c("mock","CoV2"), each=3))
# Build colData
colData <- data.frame(tissue = tissue, condition = condition)
rownames(colData) <- sample_names
# Check
nrow(colData) # should be 19
ncol(count_matrix) # should be 19
colData
18
18
| tissue | condition | |
|---|---|---|
| <chr> | <chr> | |
| MW1_cornea_mock_1 | cornea | mock |
| MW2_cornea_mock_2 | cornea | mock |
| MW3_cornea_mock_3 | cornea | mock |
| MW4_cornea_CoV2_1 | cornea | CoV2 |
| MW5_cornea_CoV2_2 | cornea | CoV2 |
| MW6_cornea_CoV2_3 | cornea | CoV2 |
| MW7_limbus_mock_1 | limbus | mock |
| MW8_limbus_mock_2 | limbus | mock |
| MW9_limbus_mock_3 | limbus | mock |
| MW10_limbus_CoV2_1 | limbus | CoV2 |
| MW11_limbus_CoV2_2 | limbus | CoV2 |
| MW12_limbus_CoV2_3 | limbus | CoV2 |
| MW13_sclera_mock_1 | sclera | mock |
| MW14_sclera_mock_2 | sclera | mock |
| MW15_sclera_mock_3 | sclera | mock |
| MW16_sclera_CoV2_1 | sclera | CoV2 |
| MW17_sclera_CoV2_2 | sclera | CoV2 |
| MW18_sclera_CoV2_3 | sclera | CoV2 |
In [4]:
## Check if all column names in count_matrix are in colData or metadata rownames
all(colnames(count_matrix) %in% rownames(colData))
# Check if all colData rownames are in count_matrix columns
all(rownames(colData) %in% colnames(count_matrix))
#Check the order of samples
#DESeq2 requires row order in colData to match column order in count_matrix:
all(rownames(colData) == colnames(count_matrix))
TRUE
TRUE
TRUE
Differential expression analysis using DeSeq2¶
In [49]:
#install.packages("BiocManager")
#BiocManager::install(version = "3.18")
#BiocManager::install("DESeq2")
In [55]:
library(DESeq2)
dds <- DESeqDataSetFromMatrix(
countData = round(count_matrix),
colData = colData,
design = ~ condition
)
converting counts to integer mode Warning message in DESeqDataSet(se, design = design, ignoreRank): “some variables in design formula are characters, converting to factors”
In [56]:
## Filtering before DeSeq2 creating
dds <- dds[rowSums(counts(dds)) >= 10, ] # keep genes with ≥10 counts across all samples
In [57]:
## Run DESeq2
dds <- DESeq(dds)
res <- results(dds)
estimating size factors estimating dispersions gene-wise dispersion estimates mean-dispersion relationship final dispersion estimates fitting model and testing
In [58]:
res
log2 fold change (MLE): condition mock vs CoV2
Wald test p-value: condition mock vs CoV2
DataFrame with 17925 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
A1BG 90.387769 0.1449492 0.117706 1.231452 0.2181537 0.775805
A1BG-AS1 278.108499 0.0490181 0.112583 0.435394 0.6632767 0.952794
A2M 872.917260 0.8098853 0.741530 1.092182 0.2747531 0.812063
A2M-AS1 11.330923 -0.4885154 0.252834 -1.932160 0.0533398 NA
A2ML1 0.578192 1.1192600 1.825458 0.613139 0.5397843 NA
... ... ... ... ... ... ...
ZYG11A 17.652 0.3833286 0.3738318 1.025404 0.3051725 0.835643
ZYG11B 1384.163 -0.0983171 0.1080570 -0.909863 0.3628946 0.868647
ZYX 6717.754 0.2508181 0.1112974 2.253586 0.0242222 0.352814
ZZEF1 1362.867 -0.1285807 0.0928343 -1.385056 0.1660353 0.723257
ZZZ3 1186.311 -0.1190627 0.0749112 -1.589385 0.1119735 0.646802
In [59]:
summary(res)
out of 17925 with nonzero total read count adjusted p-value < 0.1 LFC > 0 (up) : 125, 0.7% LFC < 0 (down) : 183, 1% outliers [1] : 0, 0% low counts [2] : 4518, 25% (mean count < 14) [1] see 'cooksCutoff' argument of ?results [2] see 'independentFiltering' argument of ?results
Filtering¶
In [11]:
# Filter out genes with significant p-values (padj < 0.05)
significant_genes <- res[!is.na(res$padj) & res$padj < 0.05, ]
# Identify upregulated genes (log2FoldChange > 0)
upregulated_genes <- significant_genes[significant_genes$log2FoldChange > 0, ]
# Identify downregulated genes (log2FoldChange < 0)
downregulated_genes <- significant_genes[significant_genes$log2FoldChange < 0, ]
# Get the total number of upregulated and downregulated genes
num_upregulated <- nrow(upregulated_genes)
num_downregulated <- nrow(downregulated_genes)
# Print the results
cat("Number of upregulated genes:", num_upregulated, "\n")
cat("Number of downregulated genes:", num_downregulated, "\n")
Number of upregulated genes: 83 Number of downregulated genes: 146
MA plot¶
In [12]:
plotMA(res, ylim = c(-5,5))
#In DESeq2, the MA plot is often used to visualize the effect of shrinkage estimators and to identify genes with significant differential expression, which are typically highlighted in color.
PCA plot¶
In [13]:
# Perform variance stabilizing transformation (VST) on DESeqDataSet (dds)
vsd <- vst(dds, blind = FALSE)
# Plot PCA without the table, remove grid lines, add axis lines, and include ticks
library(ggplot2)
# Generate the PCA plot without the data table
plotPCA(vsd, intgroup = "tissue", returnData = FALSE) +
theme_minimal() + # Minimal theme for a clean look
theme(
axis.text = element_text(size = 12), # Larger axis text
axis.title = element_text(size = 14), # Larger axis titles
legend.title = element_text(size = 12), # Adjust legend title size
legend.text = element_text(size = 10), # Adjust legend text size
panel.grid = element_blank(), # Remove grid lines
axis.line = element_line(size = 1, color = "black"), # Add axis lines
axis.ticks = element_line(size = 1, color = "black"), # Add axis ticks
axis.ticks.length = unit(0.07, "inches")
) +
labs(
title = "Principal Component Analysis (PCA) of Gene Expression",
x = "Principal Component 1 (PC1)", # Label for x-axis
y = "Principal Component 2 (PC2)", # Label for y-axis
color = "Tissue Type"
) +
theme(legend.position = "right") # Position legend at the top
using ntop=500 top features by variance
Warning message:
“The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.”
Plotting VolcanoPlot¶
In [14]:
library(dplyr)
library(ggplot2)
library(ggrepel)
# Convert results to a data frame
res_df <- as.data.frame(res)
# Clean up results
res_df_clean <- res_df[!is.na(res_df$padj), ]
# Clean DESeq2 results
res_df_clean <- res_df[!is.na(res_df$padj), ]
# Add regulation status and convert to factor
res_df_clean <- res_df_clean %>%
mutate(
Regulation = case_when(
log2FoldChange > 0.58 & padj < 0.05 ~ "Up",
log2FoldChange < -0.58 & padj < 0.05 ~ "Down",
TRUE ~ "Not Significant" # Keep "Not Significant" for the rest
),
Regulation = factor(Regulation, levels = c("Down", "Not Significant", "Up")) # Define the factor levels
)
# Filter points above -log10(padj) > 50
res_df_clean <- res_df_clean %>%
filter(-log10(padj) <= 50)
# Select top 5 Up and Down genes
top_up <- res_df_clean %>% filter(Regulation == "Up") %>% arrange(padj) %>% slice_head(n = 5)
top_down <- res_df_clean %>% filter(Regulation == "Down") %>% arrange(padj) %>% slice_head(n = 5)
top_genes <- bind_rows(top_up, top_down)
# Volcano plot with enhanced aesthetics
volcano = ggplot(res_df_clean, aes(x = log2FoldChange, y = -log10(padj), color = Regulation)) +
geom_point(alpha = 0.65, size = 1.5) + # Keep only one geom_point() layer
scale_color_manual(
values = c("Down" = "blue", "Not Significant" = "grey", "Up" = "red"), # Color for all 3 categories
breaks = c("Down", "Not Significant", "Up") # Ensure all categories are in the legend
) +
geom_vline(xintercept = c(-0.58, 0.58), linetype = "dashed", color = "black") + # threshold lines
geom_hline(yintercept = 1.2, linetype = "dashed", color = "black") + # significance threshold
geom_text_repel(data = top_genes, aes(label = rownames(top_genes)), size = 3, max.overlaps = 50, show.legend = FALSE) + # Disable legend for text labels
theme_minimal() +
theme(
axis.line.y.left = element_line(color = "black", size = 0.8), # Solid left y-axis
axis.line.x = element_line(color = "black", linewidth = 0.8), # Solid x-axis
axis.line.y.right = element_blank(), # Remove right y-axis
legend.position = c(0.8, 0.8), # Custom position for the legend (X = 0.8, Y = 0.8)
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 10),
legend.key.height = unit(1.5, "lines"),
legend.key.width = unit(2, "lines"),
legend.spacing.y = unit(0.5, "cm"), # Increase space between items in the legend
legend.box.spacing = unit(0.5, "cm"), # Adjust space around the legend box
axis.ticks.length = unit(0.1, "cm"), # Increase the tick length
axis.ticks = element_line(color = "black", size = 0.8), # Black color and size for ticks
axis.ticks.length.x = unit(0.15, "cm"),
axis.ticks.length.y = unit(0.15, "cm"),
panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank() # Remove minor grid lines
) +
labs(
x = "log2 FC",
y = "-log10 Adjusted p-value"
)
# Display the plot
volcano
Attaching package: ‘dplyr’
The following object is masked from ‘package:Biobase’:
combine
The following object is masked from ‘package:matrixStats’:
count
The following objects are masked from ‘package:GenomicRanges’:
intersect, setdiff, union
The following object is masked from ‘package:GenomeInfoDb’:
intersect
The following objects are masked from ‘package:IRanges’:
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from ‘package:S4Vectors’:
first, intersect, rename, setdiff, setequal, union
The following objects are masked from ‘package:BiocGenerics’:
combine, intersect, setdiff, union
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Warning message:
“A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
3.5.0.
ℹ Please use the `legend.position.inside` argument of `theme()` instead.”
Heatmap¶
In [15]:
install.packages("pheatmap")
Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’ (as ‘lib’ is unspecified)
In [16]:
# Normalize counts using DESeq2
log_norm_counts <- log2(counts(dds, normalized = TRUE) + 1)
# Select top 50 variable genes
library(matrixStats)
top_var_genes <- head(order(rowVars(log_norm_counts), decreasing = TRUE), 50)
mat <- log_norm_counts[top_var_genes, ]
# Publication-ready color palette
pub_palette <- colorRampPalette(c("#5D3A9B", "white", "#D7191C"))(100)
# Draw heatmap
library(pheatmap)
heatmap <- pheatmap(
mat,
scale = "row",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "complete",
color = pub_palette,
show_rownames = TRUE,
show_colnames = TRUE,
fontsize_row = 8,
fontsize_col = 10,
border_color = NA,
main = "Top 50 Variable Genes"
)
# Save plot
ggsave("heatmap.png", plot = heatmap, width = 10, height = 12)
heatmap
GO Enrichment Analysis¶
In [17]:
# Filter the results to get significant DEGs (adjusted p-value < 0.05)
deg <- res[which(res$padj < 0.05), ]
deg_genes <- rownames(deg)
In [18]:
install.packages("devtools")
Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’ (as ‘lib’ is unspecified)
In [19]:
devtools::install_github("ctlab/fgsea")
devtools::install_github("GuangchuangYu/DOSE")
devtools::install_github("YuLab-SMU/enrichplot")
devtools::install_github("YuLab-SMU/clusterProfiler")
Skipping install of 'fgsea' from a github remote, the SHA1 (d6f1670c) has not changed since last install. Use `force = TRUE` to force installation Downloading GitHub repo GuangchuangYu/DOSE@HEAD
Skipping 15 packages ahead of CRAN: zlibbioc, GenomeInfoDbData, GenomeInfoDb, XVector, Biostrings, KEGGREST, S4Vectors, IRanges, Biobase, BiocGenerics, AnnotationDbi, GO.db, BiocParallel, qvalue, GOSemSim
── R CMD build ───────────────────────────────────────────────────────────────── * checking for file ‘/tmp/RtmpXTSN2B/remotesd66057730c/YuLab-SMU-DOSE-34a6365/DESCRIPTION’ ... OK * preparing ‘DOSE’: * checking DESCRIPTION meta-information ... OK * checking for LF line-endings in source and make files and shell scripts * checking for empty or unneeded directories * looking to see if a ‘data/datalist’ file should be added * building ‘DOSE_4.3.0.tar.gz’
Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’ (as ‘lib’ is unspecified) Warning message in i.p(...): “installation of package ‘/tmp/RtmpXTSN2B/filed6a08fb2f/DOSE_4.3.0.tar.gz’ had non-zero exit status” Downloading GitHub repo YuLab-SMU/enrichplot@HEAD
Skipping 17 packages ahead of CRAN: zlibbioc, GenomeInfoDbData, GenomeInfoDb, XVector, Biostrings, KEGGREST, S4Vectors, IRanges, Biobase, BiocGenerics, AnnotationDbi, GO.db, BiocParallel, treeio, qvalue, GOSemSim, ggtree
── R CMD build ───────────────────────────────────────────────────────────────── * checking for file ‘/tmp/RtmpXTSN2B/remotesd63488ce6/YuLab-SMU-enrichplot-bf86dcd/DESCRIPTION’ ... OK * preparing ‘enrichplot’: * checking DESCRIPTION meta-information ... OK * checking for LF line-endings in source and make files and shell scripts * checking for empty or unneeded directories * building ‘enrichplot_1.29.2.001.tar.gz’
Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’ (as ‘lib’ is unspecified) Warning message in i.p(...): “installation of package ‘/tmp/RtmpXTSN2B/filed69eb5168/enrichplot_1.29.2.001.tar.gz’ had non-zero exit status” Skipping install of 'clusterProfiler' from a github remote, the SHA1 (39395ce1) has not changed since last install. Use `force = TRUE` to force installation
In [20]:
# Install Bioconductor package manager if it's not already installed
install.packages("BiocManager")
# Install org.Hs.eg.db
BiocManager::install("org.Hs.eg.db")
Installing package into ‘/home/jovyan/R/x86_64-conda-linux-gnu-library/4.3’
(as ‘lib’ is unspecified)
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
CRAN: https://cran.r-project.org
Bioconductor version 3.18 (BiocManager 1.30.26), R 4.3.3 (2024-02-29)
Warning message:
“package(s) not installed when version(s) same as or greater than current; use
`force = TRUE` to re-install: 'org.Hs.eg.db'”
Old packages: 'DOSE', 'enrichplot', 'AER', 'av', 'boot', 'broom', 'checkmate',
'collections', 'commonmark', 'cowplot', 'crosstalk', 'curl', 'data.table',
'Deriv', 'dials', 'doBy', 'doFuture', 'evaluate', 'future', 'future.apply',
'gert', 'GGally', 'gganimate', 'gghighlight', 'ggpubr', 'ggridges',
'ggstats', 'glmnet', 'hardhat', 'haven', 'httpuv', 'httr2', 'infer', 'later',
'mlr', 'modeldata', 'ndjson', 'parallelly', 'patchwork', 'pbkrtest',
'pillar', 'plotly', 'pROC', 'promises', 'purrr', 'qpdf', 'quanteda', 'Rcpp',
'RcppArmadillo', 'readtext', 'reticulate', 'RODBC', 'rprojroot', 'rsample',
'rvest', 's2', 'sf', 'shiny', 'tibble', 'tidytext', 'units', 'usethis',
'utf8', 'V8', 'waldo', 'workflows', 'xfun', 'XML', 'xml2'
In [41]:
# Ensure plots render inline
options(jupyter.plot_mimetypes = c("image/png"))
In [48]:
library(clusterProfiler)
library(org.Hs.eg.db) # Load for human, adjust based on your organism
# Perform GO enrichment analysis (Biological Process - BP, Molecular Function - MF, Cellular Component - CC)
ego <- enrichGO(
gene = deg_genes, # DEGs (gene symbols)
OrgDb = org.Hs.eg.db, # Organism database
keyType = "SYMBOL", # Gene identifier type (e.g., SYMBOL, ENTREZID)
ont = "BP", # Biological Process ontology (can be changed to MF or CC)
pvalueCutoff = 0.05, # Adjusted p-value threshold
qvalueCutoff = 0.2 # FDR threshold (optional)
)
# View the results
#head(ego)
barplot(ego, color = "p.adjust", showCategory = 10)
In [ ]: